##################
##################
##
## Python Preprocessing for Blaydes, Grimmer, and McQueen "Mirrors for the Princes and Sultans"
##
##################
##################


import re, os
from nltk import PorterStemmer

stemmer = PorterStemmer()


##first pass: for every stem, count how many books it appears in
##(each book contributes at most 1 per stem, however often the stem
##occurs in it); further pruning of the vocabulary is done in R

with open('ShortBooks.csv', 'r') as file_short:
    starts = file_short.readlines()

##maps stem -> number of books that contain it at least once
doc_freq = {}

##row 0 is skipped (presumably the CSV header -- confirm against the file)
for row in starts[1:]:
    if not row.strip():
        ##ignore blank rows instead of failing on open('')
        continue
    ##the last comma-separated field is the path of the book's text file
    path = row.split(',')[-1].rstrip('\r\n')
    with open(path, 'r') as f:
        text = f.read()
    text = text.lower()
    ##turn every non-word character into a space before tokenizing
    text = re.sub(r'\W', ' ', text)
    ##stems already counted for this book
    seen = set()
    for token in text.split():
        word = stemmer.stem(token)
        if word in seen:
            continue
        seen.add(word)
        doc_freq[word] = doc_freq.get(word, 0) + 1


##write one 'stem,count' line per stem, in the dictionary's own order
with open('ShortBookValueKeys.txt', 'w') as rfile:
    rfile.write('stem,count')
    rfile.write('\n')
    for word, count in doc_freq.items():
        rfile.write('%s,%s' % (word, count))
        rfile.write('\n')






##re-read the book list and drop the books that are excluded from the analysis
with open('ShortBooks.csv', 'r') as file_short:
    starts = file_short.readlines()

##a row is dropped when its first field, lower-cased, contains any of these names
excluded_books = re.compile('truelaw|alchemy|consolationforruler')

##every other row (row 0 included) is kept unchanged, line ending and all
clean_starts = [
    row for row in starts
    if not excluded_books.search(row.split(',')[0].lower())
]

##load the lookup table that maps a book name (first column of
##CreateBookDictionary.csv) to its book ID (second column)

with open('CreateBookDictionary.csv', 'r') as files:
    contents = files.read()

dict_books = {}

##splitlines() accepts '\r', '\n' and '\r\n' row endings alike, so the table
##loads fully whether or not the reader already translated the bare
##carriage returns this file was originally split on
for record in contents.splitlines():
    if not record.strip():
        ##a trailing row terminator leaves an empty record; skip it
        continue
    fields = record.split(',')
    dict_books[fields[0]] = fields[1]


##write the revised book list: the kept rows of the book list with the
##book ID from dict_books inserted as the second column

with open('RevisedShortBooks.csv', 'w') as new_files:
    new_files.write('fullBook,bookID,shortFile,fullFile')
    new_files.write('\n')

    ##clean_starts[0] is not written; the header line above takes its place
    for record in clean_starts[1:]:
        fields = record.strip('\n').split(',')
        book = fields[0]
        ##the ID is looked up by the book name in the first field
        new_files.write(
            '%s,%s,%s,%s' % (book, dict_books[book], fields[1], fields[2])
            + '\n'
        )





##build the term-document matrix ShortTermDoc.csv: one row per kept book,
##one column per stem listed in ShortFinalKeys.txt, and a final column
##counting occurrences of the capitalized string 'Lord'

with open('ShortFinalKeys.txt', 'r') as key1:
    key_use = key1.readlines()

##the first line of the key file is skipped; every remaining line is one stem
##(line endings are removed so the stems compare equal to the stemmer output)
clean_key = [line.rstrip('\r\n') for line in key_use[1:]]

##header row: the stems in key-file order, then the capitalized Lord column
clean_text = ','.join(clean_key + ['lord_cap'])

with open('ShortTermDoc.csv', 'w') as term_doc:
    term_doc.write(clean_text)
    term_doc.write('\n')

    ##clean_starts[0] is skipped (presumably the CSV header -- confirm)
    for row in clean_starts[1:]:
        ##the last comma-separated field is the path of the book's text file
        path = row.split(',')[-1].rstrip('\r\n')
        with open(path, 'r') as f:
            text = f.read()
        ##count 'Lord' and blank it out in a single pass, before lower-casing,
        ##so those occurrences are tallied in lord_cap and not under a stem.
        ##NOTE(review): the pattern has no word boundary, so it also matches
        ##inside longer words such as 'Lordship' -- confirm this is intended
        text, lord_cap = re.subn('Lord', ' ', text)
        text = text.lower()
        ##turn every non-word character into a space before tokenizing
        text = re.sub(r'\W', ' ', text)
        ##maps stem -> number of occurrences in this book
        counts = {}
        for token in text.split():
            word = stemmer.stem(token)
            counts[word] = counts.get(word, 0) + 1
        ##one cell per key stem (0 when the book lacks it), then lord_cap
        cells = [str(counts.get(word, 0)) for word in clean_key]
        cells.append(str(lord_cap))
        term_doc.write(','.join(cells))
        term_doc.write('\n')



